The data source (the CHSI dataset) can be found here.
import glob

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.cm as cm
from matplotlib.pyplot import figaspect
from mpl_toolkits.axes_grid1 import make_axes_locatable
import geopandas as gpd
import seaborn as sns
# FIX: `scipy.stats.stats` is a deprecated internal alias (removed in newer
# SciPy releases); the public import path is `scipy.stats`.
from scipy.stats import pearsonr

mpl.rcParams.update({'font.size': 14})
#Reading all 11 csv files
# The CHSI worksheets, one dataframe per file, in alphabetical file order.
files = sorted(glob.glob('../CHSI_dataset/*.csv'))
_frames = [pd.read_csv(path) for path in files[:11]]
DATA_ELEMENT_DESCRIPTION = _frames[0]
DEFINED_DATA_VALUE = _frames[1]
DEMOGRAPHICS = _frames[2]
HEALTHY_PEOPLE_2010 = _frames[3]
LEADING_CAUSES_OF_DEATH = _frames[4]
MEASURES_OF_BIRTH_AND_DEATH = _frames[5]
PREVENTIVE_SERVICES_USE = _frames[6]
RELATIVE_HEALTH_IMPORTANCE = _frames[7]
RISK_FACTORS_AND_ACCESS_TO_CARE = _frames[8]
SUMMARY_MEASURES_OF_HEALTH = _frames[9]
VULNERABLE_POPS_AND_ENV_HEALTH = _frames[10]
#Auxiliary files
# NOTE: the bare expressions below look like notebook display cells; as plain
# script statements they are no-ops.
DATA_ELEMENT_DESCRIPTION
DEFINED_DATA_VALUE
HEALTHY_PEOPLE_2010
LEADING_CAUSES_OF_DEATH.head()
#Outlier column name
# Normalize the inconsistently-capitalized homicide column name in both the
# description table (cell values) and the death worksheet (column label).
_fixed = DATA_ELEMENT_DESCRIPTION.replace('C_Ot_homicide', 'C_Ot_Homicide')
DATA_ELEMENT_DESCRIPTION.update(_fixed)
LEADING_CAUSES_OF_DEATH.rename(columns={'C_Ot_homicide': 'C_Ot_Homicide'},
                               inplace=True)
#Functions to describe acronyms
def description(param):
    """Return the DESCRIPTION text for column name *param*, or '' if unknown."""
    matches = DATA_ELEMENT_DESCRIPTION.loc[
        DATA_ELEMENT_DESCRIPTION.COLUMN_NAME == param, 'DESCRIPTION']
    if matches.empty:
        return ''
    return matches.tolist()[0]
def short_description(param):
    """Shorten a column description by stripping boilerplate phrases."""
    noise = ['County data,', 'death measures,', 'birth measures,', ',']
    text = description(param)
    # str.replace is a no-op when the phrase is absent, so no guard is needed.
    for phrase in noise:
        text = text.replace(phrase, '')
    return text
def short_race_description(param):
    """Shorten a description, additionally stripping race words (for the
    per-race pie charts, where the race is already in the panel title)."""
    noise = ['Black', 'Hispanic', 'White', 'other',
             'County data,', 'death measures,', 'birth measures,', ',']
    text = description(param)
    for phrase in noise:
        text = text.replace(phrase, '')
    return text
#Common columns in all worksheets
# County/state identifier columns shared by every CHSI worksheet; used below
# both as merge keys and to exclude identifiers from the measure columns.
county_info = ['State_FIPS_Code', 'County_FIPS_Code', 'CHSI_County_Name',
               'CHSI_State_Name', 'CHSI_State_Abbr', 'Strata_ID_Number']
#Joining files
# The measure worksheets that get cleaned and merged into one dataframe
# (the auxiliary description/definition tables are kept separate).
worksheets = [DEMOGRAPHICS,
              LEADING_CAUSES_OF_DEATH,
              MEASURES_OF_BIRTH_AND_DEATH,
              PREVENTIVE_SERVICES_USE,
              RELATIVE_HEALTH_IMPORTANCE,
              RISK_FACTORS_AND_ACCESS_TO_CARE,
              SUMMARY_MEASURES_OF_HEALTH,
              VULNERABLE_POPS_AND_ENV_HEALTH]
#Replacing not available/reported data by zeros
# Sentinel codes used by CHSI for "not available" / "not reported" values.
nan_values = [-1111, -1111.1, -1, -9999, -2222, -2222.2, -2]
for worksheet in worksheets:
    # One-pass equivalent of the original per-sentinel where()/fillna()/update()
    # loop: map every sentinel to 0 at once.
    worksheet.replace(nan_values, 0, inplace=True)
    # The original update() also overwrote cells that were genuinely NaN with 0
    # (where() kept them NaN, fillna turned them into 0); keep that behavior.
    worksheet.fillna(0, inplace=True)
#Time dependent data should be identified and normalized
time_dependent_worksheets = [LEADING_CAUSES_OF_DEATH,
                             MEASURES_OF_BIRTH_AND_DEATH,
                             PREVENTIVE_SERVICES_USE,
                             VULNERABLE_POPS_AND_ENV_HEALTH]
# Reporting windows found in the *_Time_Span columns and the number each one is
# converted to, used below to normalize span totals to per-period values.
# NOTE(review): the factors (7, 4, 2) do not equal the span lengths in years
# (10, 5, 3) — confirm the intended conversion against the CHSI codebook.
time_spans = ['1994-2003', '1999-2003', '2001-2003']
time_span_convert = [7,4,2]
# Convert each *_Time_Span column from its span string to a period count and
# standardize its name to 'Time' so later code can address it uniformly.
for worksheet in time_dependent_worksheets:
    for col in list(worksheet.columns):
        if 'Time_Span' in col:
            # Assign the result instead of Series.replace(..., inplace=True) on
            # a column selection, which warns (and under pandas copy-on-write
            # may silently fail to write back).
            worksheet[col] = worksheet[col].replace(time_spans, time_span_convert)
            worksheet.rename(columns={col: 'Time'}, inplace=True)
# Normalize every measure column positioned before the 'Time' column by the
# period count, turning span totals into per-period values.
# NOTE(review): rows whose Time value did not match a known span (or was zeroed
# by the sentinel cleanup above) will divide by an unconverted value or 0 —
# confirm the data always matches one of time_spans.
for worksheet in time_dependent_worksheets:
    for i in list(worksheet.columns):
        if worksheet.columns.get_loc(i) < worksheet.columns.get_loc('Time') and i not in county_info:
            worksheet[i]=worksheet[i]/worksheet['Time']
#Merging all clean data files
# BUG FIX: the original passed on=county_info.append('Time'). list.append
# returns None, so every merge actually ran with on=None (i.e. on all shared
# columns) while appending a duplicate 'Time' to county_info on each
# iteration. Keep the working merge behavior — join on the columns the frames
# share, since not every worksheet has a 'Time' column — and record 'Time' in
# county_info exactly once, because later loops rely on
# "col not in county_info" to skip it.
if 'Time' not in county_info:
    county_info.append('Time')
df = worksheets[0]
for sheet in worksheets[1:]:
    # on omitted -> merge on the intersection of the two frames' columns.
    df = df.merge(sheet, how='outer', sort=True).fillna(0)
df.head()
# Column names and their parallel list of descriptions, for keyword search.
observation = list(df.columns)
description_list = [description(col) for col in observation]
def observ_keyword(variable):
    """Return the column names whose description contains *variable*."""
    return [col for col, desc in zip(observation, description_list)
            if variable in str(desc)]
# Columns that merely restate other measures (percentiles, confidence
# intervals, favorable-indicator flags) are dropped from df.
filter_percentile = [description(col) for col in observ_keyword('percentile')]
filter_CI = [description(col) for col in observ_keyword('Confidence interval')]
filter_fav = [description(col) for col in observ_keyword('Favorable indicator')]
filters = (observ_keyword('Favorable indicator')
           + observ_keyword('Confidence interval')
           + observ_keyword('percentile'))
df = df.drop(columns=filters)
#Neutral columns
# Drop columns holding a single unique value (no information), keeping the
# HIV column explicitly.
_constant_cols = [col for col in df.columns
                  if col != 'D_Ot_HIV' and len(np.unique(df[col])) == 1]
df = df.drop(columns=_constant_cols)
df.head()
df.describe().transpose()
To visualize the health indices across the country, I merge the map shapefile with the data into the us_merge dataframes. Because Alaska and Hawaii are far away and out of proportion, I merge them separately. The US_plot function draws the US map for any numeric variable in the data — the default color is blue.
# Read the US county shapefile; Alaska and Hawaii are split off because they
# are drawn as separate inset maps.
us_map = gpd.read_file('../US-map/UScounties.shp')
us_map_land = us_map[ (us_map['STATE_NAME']!='Alaska')
                     &(us_map['STATE_NAME']!='Hawaii')]
us_map_Alaska = us_map[us_map['STATE_NAME']=='Alaska']
us_map_Hawaii = us_map[us_map['STATE_NAME']=='Hawaii']
#Merging map and data:
# NOTE(review): the join key is the bare county name; counties that share a
# name across states would collide here — confirm against the shapefile.
df_land = us_map_land.set_index('NAME').join(df.set_index('CHSI_County_Name'))
df_Alaska = us_map_Alaska.set_index('NAME').join(df.set_index('CHSI_County_Name'))
df_Hawaii = us_map_Hawaii.set_index('NAME').join(df.set_index('CHSI_County_Name'))
def min_max_col(variable):
    """Global min/max of *variable* across mainland, Alaska and Hawaii frames,
    so all three maps share one color scale."""
    parts = (df_land, df_Alaska, df_Hawaii)
    lows = [part[variable].min() for part in parts]
    highs = [part[variable].max() for part in parts]
    return min(lows), max(highs)
def US_plot(variable, color='Blues'):
    """Draw a county-level US choropleth of *variable* with Alaska and Hawaii
    as inset maps.

    variable: numeric column present in df_land/df_Alaska/df_Hawaii.
    color: matplotlib colormap name (default 'Blues').
    """
    h, w = figaspect(1.)
    # Shared color range across all three maps so the insets are comparable.
    mn, mx = min_max_col(variable)
    fig, ax1 = plt.subplots(1, figsize=(w*10, h*10))
    ax1.axis('off')
    ax1.set_title(description(variable), fontsize=40)
    # Colorbar axis pinned to the right edge of the main map.
    divider = make_axes_locatable(ax1)
    cax = divider.append_axes('right', size='2%', pad=0.1)
    cax.tick_params(labelsize=25)
    df_land.plot(column=variable, cmap=color,
                 linewidth=0.8, ax=ax1,
                 edgecolor='0.8', vmin=mn ,vmax=mx,
                 legend=True, cax=cax)
    # Alaska inset (figure-fraction coordinates).
    left, bottom, width, height = [0.15, 0.22, 0.24, 0.3]
    ax2 = fig.add_axes([left, bottom, width, height])
    ax2.axis('off')
    ax2.set_title('Alaska', fontsize=25)
    df_Alaska.plot(column=variable, cmap=color, vmin=mn ,vmax=mx,
                   linewidth=0.8, ax=ax2, edgecolor='0.6')
    # Hawaii inset.
    left, bottom, width, height = [0.35, 0.3, 0.2, 0.05]
    ax3 = fig.add_axes([left, bottom, width, height])
    ax3.axis('off')
    ax3.set_title('Hawaii', fontsize=25)
    df_Hawaii.plot(column =variable, cmap=color, vmin=mn ,vmax=mx,
                   linewidth=0.8, ax=ax3, edgecolor='0.6')
# Example county maps: suicide and coronary heart disease.
US_plot('Suicide', 'Greys')
US_plot('CHD', 'Reds')
# Split the birth/death worksheet's measure columns into birth-related and
# death-related groups, based on each column's description text.
Birth_Death_list = list(np.intersect1d(df.columns, MEASURES_OF_BIRTH_AND_DEATH.columns))
Birth = []
Death = []
for col in Birth_Death_list:
    if col in county_info:
        continue
    target = Birth if 'birth' in description(col) else Death
    target.append(col)
# Mean of each measure across counties, with the 'Total_*' aggregate rows
# removed and the rest sorted largest-first for plotting.
Death_Mean = {'Cause_of_Death': Death,
              'Mean_Death': [np.mean(df[col]) for col in Death]}
Birth_Mean = {'Birth': Birth,
              'Mean_Birth': [np.mean(df[col]) for col in Birth]}
df_death_mean = pd.DataFrame(Death_Mean)
df_death_mean = df_death_mean[df_death_mean['Cause_of_Death'] != 'Total_Deaths']
df_death_mean = df_death_mean.sort_values('Mean_Death', ascending=False)
df_birth_mean = pd.DataFrame(Birth_Mean)
df_birth_mean = df_birth_mean[df_birth_mean['Birth'] != 'Total_Births']
df_birth_mean = df_birth_mean.sort_values('Mean_Birth', ascending=False)
# Horizontal bar chart: mean death rate per cause, averaged over counties.
fig, ax = plt.subplots(1, figsize=(5, 5))
labels=list(short_description(i) for i in df_death_mean.Cause_of_Death)
plt.barh(labels, df_death_mean.Mean_Death, color='brown', height=0.9)
plt.xlabel('Mean of death in each county', fontsize=12)
plt.title('Cause of death rates', y=0.845, fontsize=14,
          bbox=dict(facecolor='none', edgecolor='k', boxstyle='round,pad=0.5'))
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()
# Horizontal bar chart: mean of each birth-related measure across counties.
fig, ax = plt.subplots(1, figsize=(4, 4))
labels=list(short_description(i) for i in df_birth_mean.Birth)
plt.barh(labels, df_birth_mean.Mean_Birth, color='b', height=0.9)
plt.xlabel('Mean of birth on each county', fontsize=12)
plt.title('Birth conditions', y=0.872, fontsize=14,
          bbox=dict(facecolor='none', edgecolor='k', boxstyle='round,pad=0.5'))
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()
# Bucket the cause-of-death columns by the race named in their description.
Death_race_list = list(np.intersect1d(df.columns, LEADING_CAUSES_OF_DEATH.columns))
Black = []
Hispanic = []
White = []
other = []
_race_buckets = [('Black', Black), ('Hispanic', Hispanic), ('White', White)]
for col in Death_race_list:
    if col in county_info:
        continue
    desc = description(col)
    # First matching race wins (mirrors the original if/elif chain);
    # anything unmatched falls into `other`.
    for keyword, bucket in _race_buckets:
        if keyword in desc:
            bucket.append(col)
            break
    else:
        other.append(col)
def _split_by_age(columns):
    """Partition *columns* into (under-25, 25-plus) lists, judging by the age
    range mentioned in each column's description.

    Replaces four copy-pasted loops (one per race) with a single helper.
    """
    adult_tags = ('25-44', '45-64', '65+')
    young, adult = [], []
    for col in columns:
        if any(tag in description(col) for tag in adult_tags):
            adult.append(col)
        else:
            young.append(col)
    return young, adult

Black_up_24, Black_25 = _split_by_age(Black)
Hispanic_up_24, Hispanic_25 = _split_by_age(Hispanic)
White_up_24, White_25 = _split_by_age(White)
other_up_24, other_25 = _split_by_age(other)
# Pie charts: cause-of-death mix for the under-25 age group, one panel per race.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2,2, figsize=(25, 25), sharey=True)
fig.subplots_adjust(hspace=-0.1)
cs = cm.Pastel1(np.arange(100))
mpl.rcParams['font.size'] = 15
fig.suptitle('Cause of Death, age under 25: Different Races', y=0.95, fontsize=30)
_panels = [(ax1, 'Black', Black_up_24),
           (ax2, 'Hispanic', Hispanic_up_24),
           (ax3, 'White', White_up_24),
           (ax4, 'Other', other_up_24)]
for ax, title, cols in _panels:
    ax.set_title(title, fontsize=25)
    labels = [short_race_description(c) for c in cols]
    # NOTE: the original multiplied the 'Other' panel's means by 2; ax.pie()
    # normalizes by the total, so the factor had no visual effect and is
    # dropped here for consistency with the other panels.
    ax.pie([np.mean(df[c]) for c in cols],
           labels=labels, colors=cs, autopct='%1.1f%%', radius=0.8)
plt.show()
# Pie charts: cause-of-death mix for the 25+ age group, one panel per race.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2,2, figsize=(25, 25), sharey=True)
fig.subplots_adjust(hspace=-0.1)
cs=cm.Set3(np.arange(100))
mpl.rcParams['font.size'] = 15
fig.suptitle('Cause of Death, age 25+: Different Races', y=0.95, fontsize=30)
ax1.set_title('Black', fontsize=25)
labels=list(short_race_description(i) for i in Black_25)
ax1.pie(list(np.mean(df[i]) for i in Black_25),
        labels=labels, colors=cs, autopct='%1.1f%%', radius=0.8)
ax2.set_title('Hispanic', fontsize=25)
labels=list(short_race_description(i) for i in Hispanic_25)
ax2.pie(list(np.mean(df[i]) for i in Hispanic_25),
        labels=labels, colors=cs, autopct='%1.1f%%', radius=0.8)
ax3.set_title('White', fontsize=25)
labels=list(short_race_description(i) for i in White_25)
ax3.pie(list(np.mean(df[i]) for i in White_25),
        labels=labels, colors=cs, autopct='%1.1f%%', radius=0.8)
ax4.set_title('Other', fontsize=25)
labels=list(short_race_description(i) for i in other_25)
ax4.pie(list(np.mean(df[i]) for i in other_25),
        labels=labels, colors=cs, autopct='%1.1f%%', radius=0.8)
plt.show()
# Notebook display cell (no-op in a script), then county maps of mammogram
# screening and breast cancer.
df.Mammogram
US_plot('Mammogram')
US_plot('Brst_Cancer')
The following table shows the Pearson correlation between variables:
# Full pairwise Pearson correlation matrix (very wide; trimmed below).
df.corr(method ='pearson')
Well, this is too much information about our big dataframe! To find the important correlated variables we need more cleaning.
I'll make another small dataframe with 3 columns: columns one and two hold a correlated pair, and column 3 holds the magnitude of their correlation. To make the highly correlated parameters easier to see, I sort them. I also define a threshold of 0.6 to cut the low-correlated ones.
# Indicator/bookkeeping columns excluded from the correlation analysis.
not_helping_columns = ['Sulfur_Dioxide_Ind',
    'Nitrogen_Dioxide_Ind', 'Lead_Ind', 'Carbon_Monoxide_Ind', 'Particulate_Matter_Ind', 'Time']
attrs = df.drop(columns=not_helping_columns).corr()
# Keep off-diagonal entries with |r| above the threshold as a {(col_a, col_b): r} dict.
threshold = 0.6
high_corrs = (attrs[abs(attrs) > threshold][attrs != 1.0]).unstack().dropna().to_dict()
# Sorting each key makes (a, b) and (b, a) produce identical rows, so
# drop_duplicates below collapses the symmetric matrix to one row per pair.
df_corrs = pd.DataFrame(list((sorted(key)[0], sorted(key)[1], high_corrs [key]) for key in high_corrs),
                        columns=['attribute_1','attribute_2', 'correlation'])
# Order by |correlation| descending, then de-duplicate and reindex.
df_corrs = df_corrs.iloc[abs(df_corrs['correlation']).argsort()[::-1]].drop_duplicates().reset_index(drop=True)
df_corrs.head(20)
Alright! It looks better!
# If we want to make a df with tuples of correlated pairs:
# df_corrs = pd.DataFrame(list(set([(tuple(sorted(key)), high_corrs [key])
# for key in high_corrs])), columns=['attribute_pair', 'correlation'])
#Here we plot the top 6 most correlated parameters:
# Scatter plots of six highly-correlated variable pairs from df_corrs.
colors = ['b', 'g', 'r', 'c', 'm', 'y']
markers = ['H', 'v', 's', 'P', '*', 'D']
fig, axs = plt.subplots(3,2, figsize=(12,15), facecolor='w', edgecolor='k')
fig.subplots_adjust(hspace=0.4, wspace=0.5)
fig.suptitle('Plots of highly-correlated parameters', y=0.97, fontsize=30)
axs = axs.ravel()
# NOTE(review): `i+1` plots df_corrs rows 1..6, skipping row 0 — the single
# most-correlated pair. Confirm whether that row is excluded on purpose
# (e.g. a trivial pair) or is an off-by-one.
for i in range(6):
    axs[i].scatter(df[df_corrs.loc[i+1,'attribute_1']], df[df_corrs.loc[i+1,'attribute_2']],
                   marker=markers[i], color=colors[i])
    axs[i].set_xlabel(short_description(df_corrs.loc[i+1,'attribute_1']), fontsize=14)
    axs[i].set_ylabel(short_description(df_corrs.loc[i+1,'attribute_2']), fontsize=14)
    axs[i].tick_params(axis='x', labelsize=10)
    axs[i].tick_params(axis='y', labelsize=10)
# For each cause of death, print its ten strongest correlation partners.
for cause in Death:
    matching = df_corrs[df_corrs['attribute_1'] == cause]
    print(matching.head(10))
#Now the most correlated parameters with heart disease:
num_plots = 16
colors = cm.tab20(np.linspace(0, 1, num_plots))
fig, axs = plt.subplots(int(num_plots/2),2, figsize=(12,40), facecolor='w', edgecolor='k')
fig.subplots_adjust(hspace=0.4, wspace=0.5)
fig.suptitle('Plots of most correlated parameters with heart disease', y=0.92, fontsize=30)
axs = axs.ravel()
# Columns paired with CHD in the high-correlation table.
corr_params = list(df_corrs[df_corrs['attribute_1']=='CHD']['attribute_2'])
# NOTE(review): assumes df_corrs holds at least num_plots CHD rows; otherwise
# corr_params[i] raises IndexError.
for i in range(num_plots):
    axs[i].scatter(df['CHD'], df[corr_params[i]],
                   marker='*', color=colors[i])
    axs[i].set_xlabel(short_description('CHD'), fontsize=14)
    axs[i].set_ylabel(short_description(corr_params[i]), fontsize=14)
    axs[i].tick_params(axis='x', labelsize=10)
    axs[i].tick_params(axis='y', labelsize=10)
#Now the most correlated parameters with breast cancer:
num_plots = 10
colors = cm.tab20(np.linspace(0, 1, num_plots))
fig, axs = plt.subplots(int(num_plots/2),2, figsize=(12,25), facecolor='w', edgecolor='k')
fig.subplots_adjust(hspace=0.4, wspace=0.5)
fig.suptitle('Plots of most correlated parameters with breast cancer', y=0.92, fontsize=30)
axs = axs.ravel()
# Columns paired with Brst_Cancer in the high-correlation table.
corr_params = list(df_corrs[df_corrs['attribute_1']=='Brst_Cancer']['attribute_2'])
for i in range(num_plots):
    # BUG FIX: the x-axis previously plotted df['CHD'] (copy-paste from the
    # heart-disease figure) even though the title and x-label refer to
    # breast cancer.
    axs[i].scatter(df['Brst_Cancer'], df[corr_params[i]],
                   marker='*', color=colors[i])
    axs[i].set_xlabel(short_description('Brst_Cancer'), fontsize=14)
    axs[i].set_ylabel(short_description(corr_params[i]), fontsize=14)
    axs[i].tick_params(axis='x', labelsize=10)
    axs[i].tick_params(axis='y', labelsize=10)
OK! Correlated parameters are great and can be discussed in the statistical inference section. Now, how about looking at the data from the states' point of view:
#A new dataframe based on States:
# NOTE(review): sum() aggregates every numeric column, including rate-like
# measures where summing county rates may not be meaningful — confirm which
# columns are actually totals.
df_state = df.groupby('CHSI_State_Abbr').sum().reset_index()
df_state.head()
# Bar chart of the homicide total per state.
sns.set(style='whitegrid')
fig, ax = plt.subplots(1, figsize=(20, 10))
plt.bar(df_state['CHSI_State_Abbr'], df_state['Homicide'], color='b')
plt.xlim(-1, len(df_state.index))
plt.ylabel('The sum of homicide in all the counties of each state', fontsize=16)
plt.xticks(fontsize=12)
plt.show()
# Undo the global style changes sns.set applied above.
sns.reset_defaults()
# Read the state-level shapefile; Alaska and Hawaii again become insets.
us_state = gpd.read_file('../US-States/states.shp')
us_state_land = us_state[ (us_state['STATE_NAME']!='Alaska')
                         &(us_state['STATE_NAME']!='Hawaii')]
us_state_Alaska = us_state[us_state['STATE_NAME']=='Alaska']
us_state_Hawaii = us_state[us_state['STATE_NAME']=='Hawaii']
us_state.head()
#Merging map and data:
# Joined on the state abbreviation.
df_state_land = us_state_land.set_index('STATE_ABBR').join(df_state.set_index('CHSI_State_Abbr'))
df_state_Alaska = us_state_Alaska.set_index('STATE_ABBR').join(df_state.set_index('CHSI_State_Abbr'))
df_state_Hawaii = us_state_Hawaii.set_index('STATE_ABBR').join(df_state.set_index('CHSI_State_Abbr'))
def min_max_col2(variable):
    """Global min/max of *variable* across the three state-level frames, so the
    main map and the insets share one color scale."""
    parts = (df_state_land, df_state_Alaska, df_state_Hawaii)
    lows = [part[variable].min() for part in parts]
    highs = [part[variable].max() for part in parts]
    return min(lows), max(highs)
def US_state_plot(variable, color='Blues'):
    """Draw a state-level US choropleth of *variable* with Alaska and Hawaii
    as inset maps (state analogue of US_plot).

    variable: numeric column present in the df_state_* frames.
    color: matplotlib colormap name (default 'Blues').
    """
    h, w = figaspect(1.)
    # Shared color range across all three maps.
    mn, mx = min_max_col2(variable)
    fig, ax1 = plt.subplots(1, figsize=(w*10, h*10))
    ax1.axis('off')
    ax1.set_title(short_description(variable), fontsize=40)
    # Colorbar axis pinned to the right edge of the main map.
    divider = make_axes_locatable(ax1)
    cax = divider.append_axes('right', size='2%', pad=0.1)
    cax.tick_params(labelsize=25)
    df_state_land.plot(column=variable, cmap=color,
                       linewidth=0.8, ax=ax1,
                       edgecolor='0.8', vmin=mn ,vmax=mx,
                       legend=True, cax=cax)
    # Alaska inset (figure-fraction coordinates).
    left, bottom, width, height = [0.15, 0.22, 0.24, 0.3]
    ax2 = fig.add_axes([left, bottom, width, height])
    ax2.axis('off')
    ax2.set_title('Alaska', fontsize=25)
    df_state_Alaska.plot(column=variable, cmap=color, vmin=mn ,vmax=mx,
                         linewidth=0.8, ax=ax2, edgecolor='0.6')
    # Hawaii inset.
    left, bottom, width, height = [0.35, 0.3, 0.2, 0.05]
    ax3 = fig.add_axes([left, bottom, width, height])
    ax3.axis('off')
    ax3.set_title('Hawaii', fontsize=25)
    df_state_Hawaii.plot(column =variable, cmap=color, vmin=mn ,vmax=mx,
                         linewidth=0.8, ax=ax3, edgecolor='0.6')
# Homicide per capita at the state level (summed homicides / summed population).
df_state_land['Homicide_per_capita'] = df_state_land['Homicide'].div(df_state_land['Population_Size'])
df_state_Alaska['Homicide_per_capita'] = df_state_Alaska['Homicide'].div(df_state_Alaska['Population_Size'])
df_state_Hawaii['Homicide_per_capita'] = df_state_Hawaii['Homicide'].div(df_state_Hawaii['Population_Size'])
# Register the new column so description()/short_description() can label it.
new_item = pd.DataFrame({'COLUMN_NAME':['Homicide_per_capita'], 'DESCRIPTION':['Homicide per capita']})
# FIX: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent.
DATA_ELEMENT_DESCRIPTION = pd.concat([DATA_ELEMENT_DESCRIPTION, new_item],
                                     ignore_index=True, sort=False)
US_state_plot('Homicide_per_capita', 'OrRd')
# Suicide per capita at the state level, mirroring the homicide block above.
df_state_land['Suicide_per_capita'] = df_state_land['Suicide']/df_state_land['Population_Size']
df_state_Alaska['Suicide_per_capita'] = df_state_Alaska['Suicide']/df_state_Alaska['Population_Size']
df_state_Hawaii['Suicide_per_capita'] = df_state_Hawaii['Suicide']/df_state_Hawaii['Population_Size']
# Register the new column so description()/short_description() can label it.
new_item = pd.DataFrame({'COLUMN_NAME':['Suicide_per_capita'], 'DESCRIPTION':['Suicide per capita']})
# FIX: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent.
DATA_ELEMENT_DESCRIPTION = pd.concat([DATA_ELEMENT_DESCRIPTION, new_item],
                                     ignore_index=True, sort=False)
US_state_plot('Suicide_per_capita', 'BuPu')
#Relationship between total deaths, unemployment and being uninsured
#(note: x is 'Unemployed', not the unmarried rate named in the original comment;
# the unmarried vs infant-mortality view follows below).
sns.set(style='whitegrid')
fig, ax = plt.subplots(1, figsize=(6, 5))
cmap = sns.cubehelix_palette(light=1, as_cmap=True)
# Bubble chart: position = unemployment vs uninsured; size and hue = deaths.
ax = sns.scatterplot(x='Unemployed', y='Uninsured', size='Total_Deaths', hue='Total_Deaths',
                     sizes=(20, 200), palette=cmap, data=df_state)
ax.set_xlim(0)
ax.set_ylim(0)
# Restore matplotlib defaults before the next figure.
sns.reset_orig()
# Linear-fit view: unmarried share vs infant mortality at the state level.
f, ax = plt.subplots(figsize=(5, 4))
ax = sns.regplot(x='Unmarried', y='Infant_Mortality', data=df_state)
ax.set(ylim=(0, None))
ax.set(xlim=(0, None))
ax.set_ylabel('Infant Mortality')